Manage
Create a crawler
Creates a new crawler with the provided configuration.
POST /1/crawlers
Copy
Ask AI
curl --request POST \
--url https://crawler.algolia.com/api/1/crawlers \
--header 'Authorization: Basic <encoded-value>' \
--header 'Content-Type: application/json' \
--data '{
"name": "test-crawler",
"config": {
"actions": [
{
"autoGenerateObjectIDs": true,
"cache": {
"enabled": true
},
"discoveryPatterns": [
"https://www.algolia.com/**"
],
"fileTypesToMatch": [
"html",
"pdf"
],
"hostnameAliases": {
"dev.example.com": "example.com"
},
"indexName": "algolia_website",
"name": "<string>",
"pathAliases": {
"example.com": {
"/foo": "/bar"
}
},
"pathsToMatch": [
"https://www.algolia.com/**"
],
"recordExtractor": {
"__type": "function",
"source": "<string>"
},
"schedule": "<string>",
"selectorsToMatch": [
".products",
"!.featured"
]
}
],
"apiKey": "<string>",
"appId": "<string>",
"exclusionPatterns": [
"https://www.example.com/excluded",
"!https://www.example.com/this-one-url",
"https://www.example.com/exclude/**"
],
"externalData": [
"testCSV"
],
"extraUrls": [
"<string>"
],
"ignoreCanonicalTo": true,
"ignoreNoFollowTo": true,
"ignoreNoIndex": true,
"ignorePaginationAttributes": true,
"ignoreQueryParams": [
"ref",
"utm_*"
],
"ignoreRobotsTxtRules": true,
"indexPrefix": "crawler_",
"initialIndexSettings": {},
"linkExtractor": {
"__type": "function",
"source": "({ $, url, defaultExtractor }) => {\n if (/example.com\\/doc\\//.test(url.href)) {\n // For all pages under `/doc`, only extract the first found URL.\n return defaultExtractor().slice(0, 1)\n }\n // For all other pages, use the default.\n return defaultExtractor()\n}\n"
},
"login": {
"url": "https://example.com/secure/login-with-post",
"requestOptions": {
"method": "POST",
"headers": {
"Content-Type": "application/x-www-form-urlencoded"
},
"body": "id=my-id&password=my-password",
"timeout": 5000
}
},
"maxDepth": 5,
"maxUrls": 250,
"rateLimit": 4,
"renderJavaScript": true,
"requestOptions": {
"proxy": "<string>",
"timeout": 30000,
"retries": 3,
"headers": {
"Accept-Language": "fr-FR",
"Authorization": "Bearer Aerehdf==",
"Cookie": "session=1234"
}
},
"safetyChecks": {
"beforeIndexPublishing": {
"maxLostRecordsPercentage": 10,
"maxFailedUrls": 123
}
},
"saveBackup": true,
"schedule": "every weekday at 12:00 pm",
"sitemaps": [
"https://example.com/sitemap.xyz"
],
"startUrls": [
"https://www.example.com"
]
}
}'
Copy
Ask AI
{
"id": "e0f6db8a-24f5-4092-83a4-1b2c6cb6d809"
}
Authorizations
Basic authentication header of the form `Basic <encoded-value>`, where `<encoded-value>` is the base64-encoded string `username:password`.
Body
application/json
Response
200
application/json
OK
The response is of type `object`.
Was this page helpful?
Copy
Ask AI
curl --request POST \
--url https://crawler.algolia.com/api/1/crawlers \
--header 'Authorization: Basic <encoded-value>' \
--header 'Content-Type: application/json' \
--data '{
"name": "test-crawler",
"config": {
"actions": [
{
"autoGenerateObjectIDs": true,
"cache": {
"enabled": true
},
"discoveryPatterns": [
"https://www.algolia.com/**"
],
"fileTypesToMatch": [
"html",
"pdf"
],
"hostnameAliases": {
"dev.example.com": "example.com"
},
"indexName": "algolia_website",
"name": "<string>",
"pathAliases": {
"example.com": {
"/foo": "/bar"
}
},
"pathsToMatch": [
"https://www.algolia.com/**"
],
"recordExtractor": {
"__type": "function",
"source": "<string>"
},
"schedule": "<string>",
"selectorsToMatch": [
".products",
"!.featured"
]
}
],
"apiKey": "<string>",
"appId": "<string>",
"exclusionPatterns": [
"https://www.example.com/excluded",
"!https://www.example.com/this-one-url",
"https://www.example.com/exclude/**"
],
"externalData": [
"testCSV"
],
"extraUrls": [
"<string>"
],
"ignoreCanonicalTo": true,
"ignoreNoFollowTo": true,
"ignoreNoIndex": true,
"ignorePaginationAttributes": true,
"ignoreQueryParams": [
"ref",
"utm_*"
],
"ignoreRobotsTxtRules": true,
"indexPrefix": "crawler_",
"initialIndexSettings": {},
"linkExtractor": {
"__type": "function",
"source": "({ $, url, defaultExtractor }) => {\n if (/example.com\\/doc\\//.test(url.href)) {\n // For all pages under `/doc`, only extract the first found URL.\n return defaultExtractor().slice(0, 1)\n }\n // For all other pages, use the default.\n return defaultExtractor()\n}\n"
},
"login": {
"url": "https://example.com/secure/login-with-post",
"requestOptions": {
"method": "POST",
"headers": {
"Content-Type": "application/x-www-form-urlencoded"
},
"body": "id=my-id&password=my-password",
"timeout": 5000
}
},
"maxDepth": 5,
"maxUrls": 250,
"rateLimit": 4,
"renderJavaScript": true,
"requestOptions": {
"proxy": "<string>",
"timeout": 30000,
"retries": 3,
"headers": {
"Accept-Language": "fr-FR",
"Authorization": "Bearer Aerehdf==",
"Cookie": "session=1234"
}
},
"safetyChecks": {
"beforeIndexPublishing": {
"maxLostRecordsPercentage": 10,
"maxFailedUrls": 123
}
},
"saveBackup": true,
"schedule": "every weekday at 12:00 pm",
"sitemaps": [
"https://example.com/sitemap.xyz"
],
"startUrls": [
"https://www.example.com"
]
}
}'
Copy
Ask AI
{
"id": "e0f6db8a-24f5-4092-83a4-1b2c6cb6d809"
}
Assistant
Responses are generated using AI and may contain mistakes.